############################################################################
#####################     function definitions   ###########################
############################################################################
# Shared colour palette (14 hex colours); the plots below use cccol[1].
cccol <- c(
  "#CE0013", "#16557A", "#C7A609", "#87C232", "#64C0AB", "#A14C94", "#15A08C",
  "#8B7E75", "#1E7CAF", "#EA425F", "#46489A", "#E50033", "#0F231F", "#1187CD"
)

# Draw vertical error bars with flat caps on the current plot.
#   x, y   : coordinates of the bar centres
#   upper  : distance from y up to the top cap
#   lower  : distance from y down to the bottom cap (defaults to `upper`)
#   length : cap length handed to arrows()
#   ...    : further graphical arguments forwarded to arrows()
# Stops with an error unless x, y, lower and upper all have the same length.
error.bar <- function(x, y, upper, lower=upper, length=0.1,...){
  # `length` the argument does not hide `length()` the function: R skips
  # non-function bindings when resolving a call.
  sizes <- c(length(x), length(y), length(lower), length(upper))
  if (any(sizes != sizes[1])) {
    stop("vectors must be same length")
  }
  # angle = 90 with code = 3 turns both arrow heads into flat caps.
  arrows(x, y + upper, x, y - lower,
         angle = 90, code = 3, lwd = 2, length = length, ...)
}

# Score how specifically a gene is expressed at one time point.
#   x           : named numeric vector of expression values for one gene
#   given_time  : name (or index) of the time point of interest in `x`
#   mean_cutoff : minimum mean expression across `x` required to score the gene
# Returns x[given_time] / mean(x) when the mean reaches `mean_cutoff` and the
# value at `given_time` equals the maximum of `x`; otherwise NA.
# Genes whose test cannot be decided (NA values in `x`, or `given_time` not
# present in `x`) also return NA instead of aborting the caller's apply().
SpecificGene <- function(x,given_time,mean_cutoff){
    mean_expr <- mean(x)
    value_at_time <- x[given_time]
    # Scalar `&&` is the right operator inside `if`; isTRUE() maps an NA
    # outcome to "not specific" rather than raising
    # "missing value where TRUE/FALSE needed".
    is_specific <- mean_expr >= mean_cutoff && value_at_time == max(x)
    if (isTRUE(is_specific)) {
        return(value_at_time / mean_expr)
    }
    NA
}
############################################################################
#####################           read in data     ###########################
############################################################################
# Per-sample expression table; the first column supplies the row names.
data <- read.table("../data/nsmb.2660-S2.txt", header = TRUE, row.names = 1)

# Column indices of the samples that make up each group.
Oocyte <- 1:3
Zygote <- 4:6
cell2 <- 7:12
cell4 <- 13:24
cell8 <- 25:44
Morula <- 45:60
MTE <- c(64, 66, 67, 69, 72, 76:79)
PTE <- c(61:63, 65, 68, 70, 71, 81, 82)
PE <- 84:90
EPI <- c(73:75, 80, 83)
hESC0 <- 91:98
hESC10 <- 99:124

# Column names used for the averaged matrix, and the labels shown on plots.
time_point <- c("Oocyte", "Zygote", "X2cell", "X4cell", "X8cell", "Morula",
                "MTE", "PTE", "PE", "EPI", "hESC0", "hESC10")
dev_labels <- c("Oocyte", "Zygote", "2cell", "4cell", "8cell", "Morula",
                "MTE", "PTE", "PE", "EPI", "hESC0", "hESC10")
development_path <- time_point

# One column of row-wise means per group, in the same order as `time_point`.
stage_columns <- list(Oocyte, Zygote, cell2, cell4, cell8, Morula,
                      MTE, PTE, PE, EPI, hESC0, hESC10)
avg <- do.call(
  cbind,
  lapply(stage_columns, function(cols) apply(data[, cols], 1, mean))
)
colnames(avg) <- time_point

# log2(x + 1) keeps zero expression at zero.
dData <- log2(avg + 1)

############ 2nd naive RNAseq
# Replicate-level table; values are back-transformed with 2^x - 1 below,
# i.e. they are treated as log2(fpkm + 1).
logfpkm2nd <- read.table("../data/2nd.reprogramming.lg2.all.fpkm.txt",
                         header = TRUE, row.names = 1)

# Replicate columns, in the order used for the averaging below.
n_path <- c("hiF_r1", "hiF_r2",
            "he0_r1", "he0_r2",
            "he2_r1", "he2_r2",
            "he6_r1", "he6_r2",
            "n8_r1", "n8_r2", "n8_r3",
            "n12_r1", "n12_r2",
            "n14_r1", "n14_r2", "n14_r3",
            "n20_r1", "n20_r2", "n20_r3",
            "n24p_r1", "n24p_r2",
            "n24m_r1", "n24m_r2",
            "niPS_r1", "niPS_r2")
nData_tmp <- logfpkm2nd[, n_path]
nfpkm2nd <- 2^nData_tmp - 1

n_time_point <- c("hiF", "he0", "he2", "he6", "n8", "n12", "n14", "n20",
                  "n24p", "n24m", "niPS")
n_label <- c("hiF-T", "0d", "2d", "6d", "8d", "12d", "14d", "20d",
             "24d+dox", "24d-dox", "niPSC-T")

# Positions within `n_path` of the replicates belonging to each time point.
replicate_columns <- list(1:2, 3:4, 5:6, 7:8, 9:11, 12:13, 14:16, 17:19,
                          20:21, 22:23, 24:25)
nData2ndfpkm <- do.call(
  cbind,
  lapply(replicate_columns, function(cols) apply(nfpkm2nd[, cols], 1, mean))
)
colnames(nData2ndfpkm) <- n_time_point
rownames(nData2ndfpkm) <- rownames(nfpkm2nd)

# Average on the linear scale, then return to log2(x + 1).
nData <- log2(nData2ndfpkm + 1)

###### late epiblast
# NOTE: this overwrites `data` from the development table loaded earlier.
data <- read.table("../data/late_epiblast_fpkm.txt", header = TRUE, row.names = 1)

# Column indices of the samples in each group.
M_PGC_4W_e3 <- 1:6
F_PGC_4W_e1 <- 7:8
F_PGC_4W_e2 <- 9:18
M_PGC_7W_e1 <- 19:34
M_PGC_7W_e2 <- 35:44
F_PGC_8W_e1 <- 45:62
M_Soma_7W_e1 <- 63:75
M_Soma_7W_e2 <- 76:82
M_Soma_7W_e3 <- 83:88
F_Soma_4W_e2 <- 89:94
F_Soma_8W_e1 <- 95:104

# Column order of the averaged matrix; it differs from the definition order
# above (F_Soma_4W_e2 comes fourth) and must match `group_columns` below.
e_path <- c("M_PGC_4W_e3", "F_PGC_4W_e1", "F_PGC_4W_e2", "F_Soma_4W_e2",
            "M_PGC_7W_e1", "M_PGC_7W_e2", "F_PGC_8W_e1", "M_Soma_7W_e1",
            "M_Soma_7W_e2", "M_Soma_7W_e3", "F_Soma_8W_e1")
group_columns <- list(M_PGC_4W_e3, F_PGC_4W_e1, F_PGC_4W_e2, F_Soma_4W_e2,
                      M_PGC_7W_e1, M_PGC_7W_e2, F_PGC_8W_e1, M_Soma_7W_e1,
                      M_Soma_7W_e2, M_Soma_7W_e3, F_Soma_8W_e1)

# Row-wise mean per group, then log2(x + 1).
e_avg <- do.call(
  cbind,
  lapply(group_columns, function(cols) apply(data[, cols], 1, mean))
)
eData <- log2(e_avg + 1)
colnames(eData) <- e_path
rownames(eData) <- rownames(data)


# # normalize
# library(edgeR)
# genes <- intersect(row.names(nData),row.names(eData))
# all_data <- cbind(logfpkm2nd[genes,n_path],eData[genes,e_path])
# batch <- as.factor(c(rep(1,length(n_path)),rep(2,length(e_path))))
# rmbatch_data <- removeBatchEffect(all_data,batch=batch)

# nData <- cbind(apply(rmbatch_data[,1:2],1,mean),apply(rmbatch_data[,3:4],1,mean),apply(rmbatch_data[,5:6],1,mean),apply(rmbatch_data[,7:8],1,mean),apply(rmbatch_data[,9:11],1,mean),apply(rmbatch_data[,12:13],1,mean),apply(rmbatch_data[,14:16],1,mean),apply(rmbatch_data[,17:19],1,mean),apply(rmbatch_data[,20:21],1,mean),apply(rmbatch_data[,22:23],1,mean),apply(rmbatch_data[,24:25],1,mean))
# colnames(nData) <- n_time_point
# rownames(nData) <- genes
# eData <- rmbatch_data[genes,e_path]

# nData[nData<0] = 0
# eData[eData<0] = 0

# normalize
# Treat the development table and the late-epiblast table as two batches and
# remove the batch effect on the genes they share.
library(edgeR)

# Development columns kept for the comparison (hESC0 / hESC10 are left out).
d_path <- c("Oocyte", "Zygote", "X2cell", "X4cell", "X8cell", "Morula",
            "MTE", "PTE", "PE", "EPI")
genes <- intersect(row.names(dData), row.names(eData))
all_data <- cbind(dData[genes, d_path], eData[genes, e_path])

# Batch 1 = development columns, batch 2 = late-epiblast columns.
batch <- as.factor(rep(c(1, 2), times = c(length(d_path), length(e_path))))
rmbatch_data <- removeBatchEffect(all_data, batch = batch)

# Split the corrected matrix back into its two parts.
dData <- rmbatch_data[genes, d_path]
colnames(dData) <- d_path
eData <- rmbatch_data[genes, e_path]
colnames(eData) <- e_path

# Negative corrected values are clamped to zero.
dData[dData < 0] <- 0
eData[eData < 0] <- 0

############################################################################
##############           8-cell-specific genes            ##################
############################################################################
# Score every gene for 8-cell specificity: mean FPKM across stages >= 1 and
# maximal expression at X8cell. Genes failing the test score NA.
develop_8cell <- apply(avg, 1, SpecificGene, "X8cell", 1)

# Keep the (up to) 500 best-scoring genes. sort() drops the NA scores, and
# head() never pads the result, unlike `[1:500]`, which yields NA names when
# fewer than 500 genes qualify and then breaks `avg[develop_8cell_gene, ]`.
ranked_8cell <- sort(develop_8cell, decreasing = TRUE)
develop_8cell_gene <- names(head(ranked_8cell, 500))
# write.table(cbind(develop_8cell_gene),"8c.specific.500.genes",col.names=F,row.names=F,quote=F)

# Gene list read from the first column of the k-means cluster file; its
# per-stage FPKM values are exported rounded to 3 decimals.
cluster_8cell_genes <- as.vector(read.table("../data/kmcluster_36_naive2nd.txt")[, 1])
write.table(round(avg[cluster_8cell_genes, ], 3), "cluster_8c_genes_fpkm.txt",
            col.names = TRUE, row.names = TRUE, quote = FALSE, sep = "\t")
# write.table(round(avg[develop_8cell_gene,],3),"specific_8c_genes_fpkm.txt",col.names=T,row.names=T,quote=F,sep="\t")


# Overlap of each 8-cell gene set with the genes listed in the C2 cluster file.
C2_genes <- as.vector(read.table("../Fig2/Cluster/14cluster_2_gene.txt")[, 1])
d_eight_genes <- intersect(C2_genes, develop_8cell_gene)
c_eight_genes <- intersect(C2_genes, cluster_8cell_genes)

############################################################################
##############                      plot                  ##################
############################################################################
# Draw one Fig3A panel: the per-column mean of `expr` over `genes` as a line,
# with a shaded two-sided (1 - alpha) t-interval of that mean.
#   expr     : numeric matrix, genes in rows (row names required)
#   genes    : character vector of row names to summarise
#   main     : panel title
#   labels   : x-axis tick labels, one per column of `expr`
#   line_col : colour of the mean line
#   alpha    : significance level of the interval (default 0.05)
# Genes absent from `expr` (including NA names) are dropped with a warning;
# an empty gene set produces a titled blank page so page order is preserved.
plot_stage_mean_ci <- function(expr, genes, main, labels, line_col, alpha = 0.05) {
  par(mar = c(6, 4, 4, 2))
  known <- genes[genes %in% rownames(expr)]
  if (length(known) < length(genes)) {
    warning(length(genes) - length(known), " gene(s) not found in the ",
            "expression matrix were dropped for panel: ", main, call. = FALSE)
  }
  if (length(known) == 0) {
    warning("no genes to plot for panel: ", main, call. = FALSE)
    plot.new()
    title(main = main)
    return(invisible(NULL))
  }

  # drop = FALSE keeps a one-gene selection as a matrix so apply() still works.
  sub_expr <- expr[known, , drop = FALSE]
  n_gene <- nrow(sub_expr)
  n_stage <- ncol(sub_expr)

  stage_mean <- apply(sub_expr, 2, mean)
  # Stored under its own name so stats::sd is not shadowed by a numeric vector.
  stage_sd <- apply(sub_expr, 2, sd)
  half_width <- stage_sd / sqrt(n_gene) * qt(1 - alpha / 2, n_gene - 1)
  ci_low <- stage_mean - half_width
  ci_high <- stage_mean + half_width

  plot(stage_mean, lwd = 3, type = "l", col = line_col,
       ylim = c(0, max(stage_mean, ci_low, ci_high, na.rm = TRUE)),
       xlim = c(1, n_stage), xaxt = "n", main = main,
       ylab = "fpkm", xlab = "", las = 2)
  axis(1, at = seq(n_stage), labels = labels, las = 2)

  # The band is undefined for a single gene (sd is NA); draw it only when finite.
  if (all(is.finite(half_width))) {
    polygon(c(seq_len(n_stage), rev(seq_len(n_stage))),
            c(ci_high, rev(ci_low)),
            col = adjustcolor("grey", alpha.f = 0.4), border = NA)
  }
  invisible(NULL)
}

# Four pages, one per gene set, in the same order as before.
fig3a_panels <- list(
  list(genes = develop_8cell_gene,
       main = "8C-gene in development (top500)"),
  list(genes = cluster_8cell_genes,
       main = "8C-gene in development (cluster)"),
  list(genes = d_eight_genes,
       main = "8C-gene in development (top500 overlap with C2)"),
  list(genes = c_eight_genes,
       main = "8C-gene in development (cluster overlap with C2)")
)

pdf("Fig3A.pdf", width = 6, height = 4)
for (panel in fig3a_panels) {
  plot_stage_mean_ci(avg, panel$genes, panel$main, dev_labels, cccol[1])
}
dev.off()


# Boxplots of batch-corrected expression across the development columns
# (dData) and the late-epiblast columns (eData), one page per gene set:
# first the cluster-derived 8C genes, then their overlap with C2.
pdf("Fig3A_8CgenesInLateEpiblast.pdf", width = 5, height = 5)
par(mar = c(8, 4, 4, 2))
for (gene_set in list(cluster_8cell_genes, c_eight_genes)) {
  tmp_genes <- intersect(rownames(eData), gene_set)
  if (length(tmp_genes) == 0) {
    # Nothing to draw; boxplot() would fail on an all-empty matrix.
    warning("gene set has no genes in eData; boxplot page skipped", call. = FALSE)
    next
  }
  # drop = FALSE keeps a one-gene selection as a 1-row matrix; without it both
  # sides collapse to vectors and cbind() builds a wrongly shaped 2-column matrix.
  plot_data <- cbind(dData[tmp_genes, , drop = FALSE],
                     eData[tmp_genes, , drop = FALSE])
  boxplot(plot_data, col = "white", outline = FALSE, pch = 19,
          border = cccol[1], las = 2, ylab = "log(fpkm+1)")
}
dev.off()
